This report explores possible visualization solution for logistic regression modeling.
This report is a record of interaction with a data transfer object (dto) produced by
./manipulation/0-ellis-island.R.
The next section recaps this script, exposes the architecture of the DTO, and demonstrates the language of interacting with it.
All data land on Ellis Island.
The script 0-ellis-island.R is the first script in the analytic workflow. It accomplished the following:
./data/shared/derived/meta-data-live.csv, which is updated every time Ellis Island script is executed../data/shared/meta-data-map.csv. They are used by automatic scripts in later harmonization and analysis.# load the product of 0-ellis-island.R, a list object containing data and metadata
dto <- readRDS("./data/unshared/derived/dto.rds")# the list is composed of the following elements
names(dto)[1] "studyName" "filePath" "unitData" "metaData"
# 1st element - names of the studies as character vector
dto[["studyName"]][1] "alsa" "lbsl" "satsa" "share" "tilda"
# 2nd element - file paths of the data files for each study as character vector
dto[["filePath"]][1] "./data/unshared/raw/ALSA-Wave1.Final.sav" "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav"
[3] "./data/unshared/raw/SATSA-Q3.Final.sav" "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
[5] "./data/unshared/raw/TILDA-Wave1.Final.sav"
# 3rd element - is a list object containing the following elements
names(dto[["unitData"]])[1] "alsa" "lbsl" "satsa" "share" "tilda"
# each of these elements is a raw data set of a corresponding study, for example
dplyr::tbl_df(dto[["unitData"]][["lbsl"]]) Source: local data frame [656 x 38]
id AGE94 SEX94 MSTAT94 EDUC94 NOWRK94 SMK94 SMOKE
(int) (int) (int) (fctr) (int) (fctr) (fctr) (fctr)
1 4001026 68 1 divorced 16 no, retired no never smoked
2 4012015 94 2 widowed 12 no, retired no never smoked
3 4012032 94 2 widowed 20 no, retired no don't smoke at present but smoked in the past
4 4022004 93 2 NA NA NA NA never smoked
5 4022026 93 2 widowed 12 no, retired no never smoked
6 4031031 92 1 married 8 no, retired no don't smoke at present but smoked in the past
7 4031035 92 1 widowed 13 no, retired no don't smoke at present but smoked in the past
8 4032201 92 2 NA NA NA NA don't smoke at present but smoked in the past
9 4041062 91 1 widowed 7 NA no don't smoke at present but smoked in the past
10 4042057 91 2 NA NA NA NA NA
.. ... ... ... ... ... ... ... ...
Variables not shown: ALCOHOL (fctr), WINE (int), BEER (int), HARDLIQ (int), SPORT94 (int), FIT94 (int), WALK94 (int),
SPEC94 (int), DANCE94 (int), CHORE94 (int), EXCERTOT (int), EXCERWK (int), HEIGHT94 (int), WEIGHT94 (int), HWEIGHT
(int), HHEIGHT (int), SRHEALTH (fctr), smoke_now (lgl), smoked_ever (lgl), year_of_wave (dbl), age_in_years (dbl),
year_born (dbl), female (lgl), marital (chr), educ3 (chr), current_work_2 (lgl), current_drink (lgl), sedentary
(lgl), poor_health (lgl), bmi (dbl)
# 4th element - a dataset names and labels of raw variables + added metadata for all studies
dto[["metaData"]] %>% dplyr::select(study_name, name, item, construct, type, categories, label_short, label) %>%
DT::datatable(
class = 'cell-border stripe',
caption = "This is the primary metadata file. Edit at `./data/shared/meta-data-map.csv",
filter = "top",
options = list(pageLength = 6, autoWidth = TRUE)
)dmls <- list() # dummy list
for(s in dto[["studyName"]]){
ds <- dto[["unitData"]][[s]] # get study data from dto
(varnames <- names(ds)) # see what variables there are
(get_these_variables <- c("id",
"year_of_wave","age_in_years","year_born",
"female",
"marital",
"educ3",
"smoke_now","smoked_ever",
"current_work_2",
"current_drink",
"sedentary",
"poor_health",
"bmi"))
(variables_present <- varnames %in% get_these_variables) # variables on the list
dmls[[s]] <- ds[,variables_present] # keep only them
}
lapply(dmls, names) # view the contents of the list object$alsa
[1] "id" "smoke_now" "smoked_ever" "year_of_wave" "age_in_years" "year_born"
[7] "female" "marital" "educ3" "current_work_2" "current_drink" "sedentary"
[13] "poor_health" "bmi"
$lbsl
[1] "id" "smoke_now" "smoked_ever" "year_of_wave" "age_in_years" "year_born"
[7] "female" "marital" "educ3" "current_work_2" "current_drink" "sedentary"
[13] "poor_health" "bmi"
$satsa
[1] "id" "smoke_now" "smoked_ever" "year_of_wave" "age_in_years" "year_born"
[7] "female" "marital" "educ3" "current_work_2" "current_drink" "sedentary"
[13] "poor_health" "bmi"
$share
[1] "id" "smoke_now" "smoked_ever" "year_of_wave" "year_born" "age_in_years"
[7] "female" "marital" "educ3" "current_work_2" "current_drink" "sedentary"
[13] "poor_health" "bmi"
$tilda
[1] "id" "smoke_now" "smoked_ever" "year_of_wave" "age_in_years" "year_born"
[7] "female" "marital" "educ3" "current_work_2" "current_drink" "sedentary"
[13] "poor_health" "bmi"
ds <- plyr::ldply(dmls,data.frame,.id = "study_name")
ds$id <- 1:nrow(ds) # some ids values might be identical, replace
head(ds) study_name id smoke_now smoked_ever year_of_wave age_in_years year_born female marital educ3
1 alsa 1 FALSE FALSE 1992 86 1906 FALSE mar_cohab more than high school
2 alsa 2 FALSE FALSE 1992 78 1914 TRUE mar_cohab high school
3 alsa 3 FALSE FALSE 1992 89 1903 TRUE widowed high school
4 alsa 4 FALSE FALSE 1992 78 1914 FALSE widowed high school
5 alsa 5 FALSE FALSE 1992 85 1907 FALSE widowed more than high school
6 alsa 6 FALSE FALSE 1992 92 1900 TRUE widowed high school
current_work_2 current_drink sedentary poor_health bmi
1 FALSE TRUE FALSE FALSE NA
2 FALSE TRUE FALSE FALSE NA
3 FALSE TRUE TRUE FALSE NA
4 TRUE TRUE FALSE TRUE NA
5 FALSE TRUE FALSE FALSE NA
6 FALSE TRUE TRUE FALSE NA
# age summary across studies
ds %>%
dplyr::group_by(study_name) %>%
na.omit() %>%
dplyr::summarize(mean_age = round(mean(age_in_years),1),
sd_age = round(sd(age_in_years),2),
observed = n(),
min_born = min(year_born),
med_born = median(year_born),
max_born = max(year_born)) Source: local data frame [4 x 7]
study_name mean_age sd_age observed min_born med_born max_born
(fctr) (dbl) (dbl) (int) (dbl) (dbl) (dbl)
1 lbsl 68.0 13.11 516 1900 1925 1964
2 satsa 63.1 12.67 1283 1900 1925 1998
3 share 62.0 9.97 2481 1911 1944 1967
4 tilda 62.6 9.12 4531 1929 1947 1960
# see counts across age groups and studies
t <- table(
cut(ds$age_in_years,breaks = c(-Inf,seq(from=40,to=100,by=5), Inf)),
ds$study_name,
useNA = "always"
); t[t==0] <- "."; t
alsa lbsl satsa share tilda <NA>
(-Inf,40] . 25 73 8 . .
(40,45] . 25 69 39 . .
(45,50] . 30 114 221 663 .
(50,55] . 45 162 526 1637 .
(55,60] . 28 126 489 1590 .
(60,65] 13 87 168 361 1388 .
(65,70] 258 101 222 389 1138 .
(70,75] 552 81 235 263 884 .
(75,80] 513 67 198 162 1192 .
(80,85] 425 110 96 98 . .
(85,90] 254 43 28 33 . .
(90,95] 58 13 4 6 . .
(95,100] 12 1 1 . . .
(100, Inf] 2 . . . . .
<NA> . . 1 3 12 .
# basic counts
table(ds$study_name, ds$smoke_now, useNA = "always")
FALSE TRUE <NA>
alsa 1851 217 19
lbsl 480 71 105
satsa 1067 365 65
share 2186 408 4
tilda 6939 1564 1
<NA> 0 0 0
table(ds$study_name, ds$smoked_ever, useNA = "always")
FALSE TRUE <NA>
alsa 1851 217 19
lbsl 207 351 98
satsa 700 696 101
share 1542 1052 4
tilda 3726 4777 1
<NA> 0 0 0
table(ds$study_name, ds$female, useNA = "always")
FALSE TRUE <NA>
alsa 1056 1031 0
lbsl 314 342 0
satsa 610 887 0
share 1139 1459 0
tilda 3780 4724 0
<NA> 0 0 0
table(ds$study_name, ds$marital, useNA = "always")
mar_cohab sep_divorced single widowed <NA>
alsa 1367 49 76 594 1
lbsl 326 77 22 134 97
satsa 961 113 149 259 15
share 2049 159 51 336 3
tilda 5966 552 791 1195 0
<NA> 0 0 0 0 0
table(ds$study_name, ds$educ3, useNA = "always")
high school less than high school more than high school <NA>
alsa 819 337 905 26
lbsl 170 74 310 102
satsa 121 1239 109 28
share 889 965 717 27
tilda 2795 5222 483 4
<NA> 0 0 0 0
d <- ds %>%
dplyr::select_("id", "study_name", "smoke_now",
"age_in_years", "female", "marital", "educ3","poor_health") %>%
na.omit()
mdl <- glm(
formula = smoke_now ~ -1 + study_name + age_in_years + female + marital + educ3 + poor_health,
data = d, family = "binomial"
);summary(mdl)
Call:
glm(formula = smoke_now ~ -1 + study_name + age_in_years + female +
marital + educ3 + poor_health, family = "binomial", data = d)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4099 -0.6540 -0.5434 -0.4232 2.5534
Coefficients:
Estimate Std. Error z value Pr(>|z|)
study_namealsa 0.99389 0.20694 4.803 1.57e-06 ***
study_namelbsl 0.71123 0.21340 3.333 0.00086 ***
study_namesatsa 1.13828 0.16995 6.698 2.11e-11 ***
study_nameshare 0.70152 0.16513 4.248 2.15e-05 ***
study_nametilda 0.85954 0.15958 5.386 7.20e-08 ***
age_in_years -0.04168 0.00255 -16.342 < 2e-16 ***
femaleTRUE -0.17909 0.04534 -3.950 7.81e-05 ***
maritalsep_divorced 0.65913 0.07874 8.371 < 2e-16 ***
maritalsingle 0.24404 0.08254 2.956 0.00311 **
maritalwidowed 0.37548 0.06988 5.374 7.72e-08 ***
educ3less than high school 0.21996 0.05219 4.215 2.50e-05 ***
educ3more than high school -0.30567 0.07903 -3.868 0.00011 ***
poor_healthTRUE 0.35032 0.04857 7.213 5.49e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 20836 on 15030 degrees of freedom
Residual deviance: 13256 on 15017 degrees of freedom
AIC: 13282
Number of Fisher Scoring iterations: 4
d$smoke_now_p <- predict(mdl)graph_logistic_point_complex_4(
ds = d,
x_name = "age_in_years",
y_name = "smoke_now_p",
covar_order = c("female","marital","educ3","poor_health"),
alpha_level = .3) graph_logitstic_curve_complex_4(
ds = d,
x_name = "age_in_years",
y_name = "smoke_now",
covar_order = c("female","marital","educ3","poor_health"),
alpha_level = .3) model_outcome <- "smoke_now"
model_predictors <- c("age_in_years", "female", "marital", "educ3","poor_health")
ml <- list() # create a model list object to contain model estimation and modeled data
for(s in dto[["studyName"]]){
d <- dto[['unitData']][[s]] %>%
dplyr::select_(.dots=c("id",model_outcome, model_predictors)) %>%
na.omit()
mdl <- stats::glm( # fit model
formula = smoke_now ~ age_in_years +female + marital + educ3 + poor_health ,
data = d, family="binomial"
); summary(mdl);
modeled_response_name <- paste0(model_outcome,"_p")
d[,modeled_response_name] <- predict(mdl)
ml[["data"]][[s]] <- d
ml[["model"]][[s]] <- mdl
}
# names(ml[["data"]])
# names(ml[["model"]])
d <- plyr::ldply(ml[["data"]],data.frame,.id = "study_name")
d$id <- 1:nrow(d) # some ids values might be identical, replace
head(d) study_name id smoke_now age_in_years female marital educ3 poor_health smoke_now_p
1 alsa 1 FALSE 86 FALSE mar_cohab more than high school FALSE -2.484083
2 alsa 2 FALSE 78 TRUE mar_cohab high school FALSE -2.685699
3 alsa 3 FALSE 89 TRUE widowed high school FALSE -2.988931
4 alsa 4 FALSE 78 FALSE widowed high school TRUE -1.711330
5 alsa 5 FALSE 85 FALSE widowed more than high school FALSE -2.130293
6 alsa 6 FALSE 92 TRUE widowed high school FALSE -3.153186
graph_logistic_point_complex_4(
ds = d,
x_name = "age_in_years",
y_name = "smoke_now_p",
covar_order = c("female","marital","educ3","poor_health"),
alpha_level = .3) graph_logitstic_curve_complex_4(
ds = d,
x_name = "age_in_years",
y_name = "smoke_now",
covar_order = c("female","marital","educ3","poor_health"),
alpha_level = .3)